###############################################################################-
# Author:  Pietryka
# Contact: matthew.pietryka@gmail.com
# Purpose: summarize the results of the permutations
# Notes:   
###############################################################################-

#  1. Load Packages  =====================

library(dplyr)
library(tidyr)
library(readr)
library(purrr)
library(glue)


# 2. load data ===============

# student data
indiv18_df <- read_rds("data-files/indiv18_df.rds")

# dyad data
observed_dyads_df <- read_rds("data-files/observed_dyads_df.rds")

# simulated dyads (unconstrained)
replicates_unconstrained_df <- 
  read_rds("data-files/replicates_unconstrained_df.rds")

# simulated dyads (constrained at hall level)
replicates_df <- read_rds("data-files/replicates_df.rds")


# 3. add outcomes to permutations ===============================

sim_unconstrained_df <- replicates_unconstrained_df %>% 
  ungroup()  %>% 
  select(sim_id, anon_id1, anon_id2)  %>% 
  left_join(indiv18_df, by = c("anon_id1"  = "anon_id"))  %>% 
  left_join(indiv18_df, by = c("anon_id2" = "anon_id"), 
            suffix = c("_f", "_rm")) 

sim_df <- replicates_df   %>% 
  ungroup()  %>% 
  select(sim_id, anon_id1, anon_id2)  %>% 
  left_join(indiv18_df, by = c("anon_id1"  = "anon_id"))  %>% 
  left_join(indiv18_df, by = c("anon_id2" = "anon_id"), 
            suffix = c("_f", "_rm")) 


# 4. summarize outcome variables for simulated dyads ================


mean_estimates <- function(df, ...){
  summarise(
    df, 
    prop_identical_2016 = mean(turnout_2016_f == turnout_2016_rm),
    prop_identical_2018 = mean(turnout_2018_f == turnout_2018_rm),
    prop_identical_2020 = mean(turnout_2020_f == turnout_2020_rm),
    prop_identical_parents_2012 = mean(parents_turnout_20120_f == parents_turnout_20120_rm),
    prop_identical_parents_2014 = mean(parents_turnout_20140_f == parents_turnout_20140_rm),
    n_identical = mean((turnout_2016_f == turnout_2016_rm) + (turnout_2018_f == turnout_2018_rm) + (turnout_2020_f == turnout_2020_rm)),
    cor_parents_turnout_mean0 = cor(parents_turnout_mean0_f,  parents_turnout_mean0_rm, use = "pairwise.complete.obs"),
    ...
  )  
}


# 5. summarize outcome variables in observed data ================


# pooling the genders

observed_mean_df <- observed_dyads_df  %>% 
  mean_estimates() %>% 
  pivot_longer(everything(), values_to = "observed_estimate")

estimate_names <- observed_mean_df$name

# grouped by gender

observed_gender_df <- observed_dyads_df  %>% 
  group_by(gender_f)  %>% 
  mean_estimates() %>% 
  pivot_longer(all_of(estimate_names), values_to = "observed_estimate")


## 5A. pooling the genders ------------------------

sim_unconstrained_mean_df <- sim_unconstrained_df %>% 
  group_by(sim_id)  %>% 
  mean_estimates() %>% 
  pivot_longer(
    all_of(estimate_names), 
    values_to = "sim_unconstrained_estimate"
  ) 


sim_mean_df <- sim_df %>% 
  group_by(sim_id)  %>% 
  mean_estimates() %>% 
  pivot_longer(
    all_of(estimate_names), 
    values_to = "sim_estimate")   %>% 
  left_join(sim_unconstrained_mean_df)



## 5B. estimates, grouped by gender  --------------


sim_gender_unconstrained_mean_df <- sim_unconstrained_df %>% 
  group_by(sim_id, gender_f)  %>% 
  mean_estimates() %>% 
  pivot_longer(
    all_of(estimate_names), 
    values_to = "sim_unconstrained_estimate"
  ) 


sim_gender_mean_df <- sim_df %>% 
  group_by(sim_id, gender_f)  %>% 
  mean_estimates() %>% 
  pivot_longer(
    all_of(estimate_names), 
    values_to = "sim_estimate")   %>% 
  left_join(sim_gender_unconstrained_mean_df)



# 6. compare observed and simulated outcomes ==========================

# function to extract estimates
extract_estimates <- function(df){
  df %>% 
    mutate(
      sim_estimate_mean = map_dbl(sims, ~mean(.x$sim_estimate)),
      sim_estimate_unconstrained_mean = map_dbl(
        sims, 
        ~mean(.x$sim_unconstrained_estimate)
      ),
      diff_estimate = observed_estimate - sim_estimate_mean,
      diff_unconstrained_estimate = observed_estimate - sim_estimate_unconstrained_mean,
      sim_estimate_sd = map_dbl(sims, ~sd(.x$sim_estimate)),
      sim_estimate_unconstrained_sd = map_dbl(
        sims, 
        ~sd(.x$sim_unconstrained_estimate)
      ),
      p_value = map2_dbl(
        observed_estimate, sims , 
        ~(1 - mean(.x  > .y$sim_estimate))
      ),
      p_value_unconstrained = map2_dbl(
        observed_estimate, 
        sims , 
        ~(1 - mean(.x  > .y$sim_unconstrained_estimate)
        )),
      p_lab = glue("p = {round(p_value, 2)}"),
      p_lab_unconstrained = glue("p = {round(p_value_unconstrained, 2)}"),
      diff_sims = map2(observed_estimate, sims , ~.x - .y$sim_estimate),
      diff_unconstrained_sims = map2(
        observed_estimate, sims ,
        ~.x - .y$sim_unconstrained_estimate
      ),
      cutoff = map_dbl(sims, ~quantile(.x$sim_estimate, probs = 0.95)),
      cutoff_unconstrained = map_dbl(
        sims, 
        ~quantile(.x$sim_unconstrained_estimate, probs = 0.95)
      ),
      hist_sims = map2(
        sims, 
        cutoff, 
        ~density(.x$sim_estimate, bw = "nrd0", adjust = 1.5) %$% 
          data.frame(x = x, y = y) %>% 
          mutate(area = x >= .y)
      ), 
      hist_sims_unconstrained = map2(
        sims, 
        cutoff_unconstrained, 
        ~density(
          .x$sim_unconstrained_estimate, bw = "nrd0", adjust = 1.0
        ) %$% 
          data.frame(x = x, y = y) %>%
          mutate(area = x >= .y)
        
      )
    ) 
}


## 6A. estimates, pooling the genders --------------

estimates_df <- observed_mean_df  %>% 
  full_join(sim_mean_df)  %>% 
  group_by(name, observed_estimate)  %>% 
  nest(sims = c(sim_id , sim_estimate, sim_unconstrained_estimate))  %>% 
  ungroup()  %>% 
  extract_estimates()




## 6B. estimates, grouped by gender  --------------


estimates_gender_df <- observed_gender_df  %>% 
  full_join(sim_gender_mean_df)  %>% 
  group_by(gender_f, name, observed_estimate)  %>% 
  nest(sims = c(sim_id , sim_estimate, sim_unconstrained_estimate))  %>% 
  ungroup()   %>% 
  extract_estimates()



gender_diff_df <- estimates_gender_df  %>% 
  select(gender_f, name, diff_sims, diff_unconstrained_sims)  %>% 
  pivot_wider(names_from = gender_f, values_from = c(diff_sims, diff_unconstrained_sims))  %>% 
  mutate(
    female_minus_male = map2(diff_sims_female, diff_sims_male , ~.x - .y),
    female_minus_male_unconstrained = map2(diff_unconstrained_sims_female , diff_unconstrained_sims_male , ~.x - .y)
  )  %>% 
  mutate(
    female_minus_male_mean = map_dbl(female_minus_male, mean),
    female_minus_male_p = map2_dbl(diff_sims_female, diff_sims_male, ~1 - mean(.x > .y)),
    female_minus_male_unconstrained_mean = map_dbl(female_minus_male_unconstrained, mean),
    female_minus_male_unconstrained_p = map2_dbl(diff_unconstrained_sims_female, diff_unconstrained_sims_male, ~1 - mean(.x > .y))
  )  %>% 
  select(name, where(is.double))




# 7 Save ==============================

write_rds(estimates_df, "data-files/estimates_df.rds")
write_rds(estimates_gender_df, "data-files/estimates_gender_df.rds")
write_rds(gender_diff_df, "data-files/gender_diff_df.rds")

